* Project      :  The Full, Persistent, and Symmetric Pass-Through of a Temporary VAT Cut
* Authors      :  Márcia Silva-Pereira, João Quelhas, Tiago Bernardino, Ricardo Duque Gabriel
* Date         :  02/04/2025
* Description  :  Robustness - Data Cleaning Methods
* Dependencies :
* Modifications: (add date, author and change)

*********************************************************

* TABLE C.3: Robustness: Data-Cleaning Methods
*********************************************************

*********************************************************
*                 Construct the datasets                *
*********************************************************

// Load the cleaned dataset.
use "${path_work}/auxiliar/combined_dataset.dta", clear

// Fill in missing values using time-series fill.
tsfill

// Save the relevant variables.
keep id_sm id ean13 date name brand supermarket price treatment ecoicop

local max_imputar = 7

// Loop through the variables and replace missing values
if `max_imputar' > 0 {

	forval imputar_dias = 1/`max_imputar' {
		
		foreach v of varlist id ean13 date name brand supermarket price treatment ecoicop {
			
			// Replace missing values with the previous value if conditions are met
			
			by id_sm (date): replace `v' = `v'[_n-1] if missing(`v') & `v'[_n-1] == `v'[_n + `imputar_dias'] & _n > 1 & !missing(`v'[_n-1]) & _n < _N
		}
	}
	
		
	forval imputar_dias = 1/`max_imputar' {
	
		foreach v of varlist id ean13 date name brand supermarket price treatment ecoicop {
		
			bys id_sm (date): replace `v' = `v'[_n + `imputar_dias'] if missing(`v') & _n < `max_imputar'+2-`imputar_dias'
		}
	}
		
	forval imputar_dias = 1/`max_imputar' {
		
		foreach v of varlist id ean13 date name brand supermarket price treatment ecoicop {
		
			bys id_sm (date): replace `v'  =`v'[_n - `imputar_dias'] if missing(`v') & _n > _N-`max_imputar'+`imputar_dias'-1
		}
	}
}

// (1) Construction of relevant variables.

// Compute a price index with base in the day before policy announcement I.
local   base_date 23/03/2023
egen    base_price_23032023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_23032023 = (price / base_price_23032023) * 100

// Compute a price index with base in the day before policy announcement II.
local   base_date 26/03/2023
egen    base_price_26032023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_26032023 = (price / base_price_26032023) * 100

// Compute a price index with base in the day before policy implementation.
local   base_date 17/04/2023
egen    base_price_17042023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_17042023 = (price / base_price_17042023) * 100

// Compute a price index with base in the day before the end of the policy.
local   base_date 04/01/2024
egen    base_price_04012024 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_04012024 = (price / base_price_04012024) * 100

drop base_price_23032023 base_price_26032023 base_price_17042023 base_price_04012024

// Rename variable ecoicop and identify the ECOICOP 4-digit classification.
rename ecoicop ecoicop5

gen ecoicop4 = .
replace ecoicop4 = 0111 if ecoicop5 == 01111 | ecoicop5 == 01113 | ecoicop5 == 01116
replace ecoicop4 = 0112 if ecoicop5 == 01121 | ecoicop5 == 01122 | ecoicop5 == 01124
replace ecoicop4 = 0113 if ecoicop5 == 01131 | ecoicop5 == 01132 
replace ecoicop4 = 0114 if ecoicop5 == 01141 | ecoicop5 == 01142 | ecoicop5 == 01144 | ///
			   ecoicop5 == 01145 | ecoicop5 == 01146 | ecoicop5 == 01147
replace ecoicop4 = 0115 if ecoicop5 == 01151 | ecoicop5 == 01153
replace ecoicop4 = 0116 if ecoicop5 == 01161
replace ecoicop4 = 0117 if ecoicop5 == 01171 | ecoicop5 == 01172 | ecoicop5 == 01174

// Identify products imported (0) and produced in Portugal (1).
gen domestic     = 1 if real(substr(string(ean13, "%14.0f"), 1, 3)) == 560
replace domestic = 0 if (real(substr(string(ean13, "%14.0f"), 1, 3)) != 560 & ean13 != .)

drop ean13

// Identify products from trademark-labels (0) and white-labels (1).
gen white_label = 0
foreach expr in "continente" "auchan" "pingo" "aldi" {
	replace white_label = 1 if strpos(brand, "`expr'") > 0
	}

drop brand

// Identify products from small supermarkets (0) and big supermarkets (1).
gen big = 1 if supermarket == "CONTINENTE" | supermarket == "PINGODOCE"
replace big = 0 if big == .

save "${path_work}/auxiliar/tempfile.dta", replace

// Identify products with price below average (0) and above average (1).
keep if date >= td(01/01/2023) & date < td(01/03/2023)
egen average_price = mean(price), by(id_sm)
keep id_sm average_price
duplicates drop id_sm, force
save "${path_work}/auxiliar/average_price.dta", replace

use "${path_work}/auxiliar/tempfile.dta"

merge m:1 id_sm using "${path_work}/auxiliar/average_price.dta"
drop _merge
sum average_price if treatment == 1, d
return list
gen high_price = 1 if average_price >= `r(mean)' 
replace high_price = 0 if average_price < `r(mean)'

erase "${path_work}/auxiliar/tempfile.dta"

*********************************************************

// (2) Flag products that were not certain to be treated (c.f. Portal das Finanças and Excel VAT_Doubts)

// Generate a new variable 'vat_doubt' and initialize it to missing values.
gen vat_doubt = .

// Loop through each product in the 'doubt' global list.
foreach expr in nabo couve carapau sangue meloa melancia sumo nectar vitelo vitela vitelao novilha novilho boi leitao bacalhau barrigas linguas cabecas caras abas galinha carcaca carcaça espetada espetadas hamburguer tostas panado panada nugget requeijao {
    // Set 'vat_doubt' to 1 for rows where the product name contains the current product in the loop.
    replace vat_doubt = 1 if strpos(name, "`expr'") > 0
}

// Loop through specific products to be flagged for further investigation.
foreach expr in "massa instantanea" "tomate seco" "tomate desidratado" "arroz vaporizado" "batata doce" "leite infantil" "leite crianca" "batata congelada" "cebola chalota" "ervilha torta" "manteiga alho" "manteiga ervas" "manteiga amendoim" "conserva atum tomate" "conserva atum escabeche" "conserva atum tomate" "conserva atum algarvia" "conserva atum picante" "conserva atum caldeirada" "conserva peixe" "mini tomate cherry" "arroz basmati" "arroz jasmim" "rolo de carne" {
    // Set 'vat_doubt' to 1 for rows where the product name contains the current product in the loop.
    replace vat_doubt = 1 if strpos(name, "`expr'") > 0
}

*********************************************************

// (3) Correct measurement errors.

// Change treatment == 1 as these are measurement errors: products classified as control
// in a COICOP 5 that has only treated products.
replace treatment = 1 if ecoicop5 == 1111 | ecoicop5 == 1113 | ecoicop5 == 1121 | ///
			 ecoicop5 == 1122 | ecoicop5 == 1135 | ecoicop5 == 1141 | ///
			 ecoicop5 == 1142 | ecoicop5 == 1144 | ecoicop5 == 1145 | ///
			 ecoicop5 == 1147 | ecoicop5 == 1151 | ecoicop5 == 1153 | ///
			 ecoicop5 == 1154
		
replace treatment = 0 if ecoicop5 == 01112 | ecoicop5 == 01115 | ecoicop5 == 01117 | ///
			 ecoicop5 == 01123 | ecoicop5 == 01125 | ecoicop5 == 01126 | ///
			 ecoicop5 == 01127 | ecoicop5 == 01133 | ecoicop5 == 01134 | ///
			 ecoicop5 == 01143 | ecoicop5 == 01152 | ecoicop5 == 01155 | ///
			 ecoicop5 == 01162 | ecoicop5 == 01163 | ecoicop5 == 01164 | ///
			 ecoicop5 == 01175 | ecoicop5 == 01118 | ecoicop5 == 01182 | ///
			 ecoicop5 == 01184 | ecoicop5 == 01185 | ecoicop5 == 01186 | ///
			 ecoicop5 == 01191 | ecoicop5 == 01192 | ecoicop5 == 01193 | ///
			 ecoicop5 == 01194 | ecoicop5 == 01195 | ecoicop5 == 01211 | ///
			 ecoicop5 == 01212 | ecoicop5 == 01213 | ecoicop5 == 01221 | ///
			 ecoicop5 == 01222 | ecoicop5 == 01223 | ecoicop5 == 02111 | ///
			 ecoicop5 == 02112 | ecoicop5 == 02121 | ecoicop5 == 02122 | ///
			 ecoicop5 == 02123 | ecoicop5 == 02124 | ecoicop5 == 02131 | ///
			 ecoicop5 == 02132 | ecoicop5 == 02133 | ecoicop5 == 02134 | ///
			 ecoicop5 == 01181 | ecoicop5 == 01183

*********************************************************

// (4) Add usable time variable to the dataset and set it as a panel.


// Merge datasets based on the 'date' variable using the "weeks.dta" file.
merge m:1 date using "${path_work}/source/weeks.dta"
drop _merge

gen year = year(date)

// Generate a new variable 'week_year' representing the combination of year and week.
gen week_year = yw(year(date), week_number)

// Drop the original 'week' variable.
drop week_number

// Format 'week_year' as a Stata weekly date.
format week_year %tw

// Create a new variable 'month_year' representing the combination of year and month.
gen month_year = ym(year(date), month(date))

// Format 'month_year' as a Stata monthly date.
format month_year %tm

// Create a new variable 'day' with the day of the observation.
generate day = day(date)

// (5) Add weights variable.

// Merge datasets based on 'ecoicop' and 'treatment'.
merge m:1 ecoicop5 treatment using "${path_work}/source/ecoicop_categories.dta"
bys ecoicop5 treatment date: gen frequency = _N
gen weight = weight_2023 / frequency
replace weight = 0 if _merge == 1
drop _merge

// Set the data as a panel with daily observations.
xtset id_sm date, daily

erase "${path_work}/auxiliar/average_price.dta"

*********************************************************

// Exclude observations before January 1, 2023.
keep if date >= td(01/01/2023)

// Exclude treated products that raised doubts.
drop if vat_doubt == 1

// Exclude treated products with a decrease of 23%.
drop if (treatment == 1 & ecoicop5 == 01154)

// Exclude outlier observations.
drop if supermarket == "PINGODOCE" & (day == 1 | day == 2 | day == 31 | day == 30 | day == 28 | day == 29 | day == 27)

* Exclude all drinks.
drop if ecoicop5 == 01211 | ecoicop5 == 01212 | ecoicop5 == 01213 | ///
	ecoicop5 == 01221 | ecoicop5 == 01222 | ecoicop5 == 01223 | ///
	ecoicop5 == 02111 | ecoicop5 == 02112 | ecoicop5 == 02121 | ///
	ecoicop5 == 02122 | ecoicop5 == 02123 | ecoicop5 == 02124 | ///
	ecoicop5 == 02131 | ecoicop5 == 02132 | ecoicop5 == 02133 | ///
	ecoicop5 == 02134
	
// Save the updated dataset.
save "${path_work}/auxiliar/imputation_1.dta", replace

*********************************************************


// Load the cleaned dataset.
use "${path_work}/auxiliar/combined_dataset.dta", clear

// Fill in missing values using time-series fill.
tsfill

// Save the relevant variables.
keep id_sm id ean13 date name brand supermarket price treatment ecoicop

// Loop through the variables and replace missing values

foreach v of varlist id ean13 date name brand supermarket price treatment ecoicop {
			
	// Replace missing values with the previous value if conditions are met.
		
	by id_sm (date): replace `v' = `v'[_n-1] if missing(`v') & _n > 1 & !missing(`v'[_n-1]) & _n < _N

	}

// (1) Construction of relevant variables.

// Compute a price index with base in the day before policy announcement I.
local   base_date 23/03/2023
egen    base_price_23032023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_23032023 = (price / base_price_23032023) * 100

// Compute a price index with base in the day before policy announcement II.
local   base_date 26/03/2023
egen    base_price_26032023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_26032023 = (price / base_price_26032023) * 100

// Compute a price index with base in the day before policy implementation.
local   base_date 17/04/2023
egen    base_price_17042023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_17042023 = (price / base_price_17042023) * 100

// Compute a price index with base in the day before the end of the policy.
local   base_date 04/01/2024
egen    base_price_04012024 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_04012024 = (price / base_price_04012024) * 100

drop base_price_23032023 base_price_26032023 base_price_17042023 base_price_04012024  

// Rename variable ecoicop and identify the ECOICOP 4-digit classification.
rename ecoicop ecoicop5

gen ecoicop4 = .
replace ecoicop4 = 0111 if ecoicop5 == 01111 | ecoicop5 == 01113 | ecoicop5 == 01116
replace ecoicop4 = 0112 if ecoicop5 == 01121 | ecoicop5 == 01122 | ecoicop5 == 01124
replace ecoicop4 = 0113 if ecoicop5 == 01131 | ecoicop5 == 01132 
replace ecoicop4 = 0114 if ecoicop5 == 01141 | ecoicop5 == 01142 | ecoicop5 == 01144 | ///
			   ecoicop5 == 01145 | ecoicop5 == 01146 | ecoicop5 == 01147
replace ecoicop4 = 0115 if ecoicop5 == 01151 | ecoicop5 == 01153
replace ecoicop4 = 0116 if ecoicop5 == 01161
replace ecoicop4 = 0117 if ecoicop5 == 01171 | ecoicop5 == 01172 | ecoicop5 == 01174

// Identify products imported (0) and produced in Portugal (1).
gen domestic     = 1 if real(substr(string(ean13, "%14.0f"), 1, 3)) == 560
replace domestic = 0 if (real(substr(string(ean13, "%14.0f"), 1, 3)) != 560 & ean13 != .)

drop ean13

// Identify products from trademark-labels (0) and white-labels (1).
gen white_label = 0
foreach expr in "continente" "auchan" "pingo" "aldi" {
	replace white_label = 1 if strpos(brand, "`expr'") > 0
	}

drop brand

// Identify products from small supermarkets (0) and big supermarkets (1).
gen big = 1 if supermarket == "CONTINENTE" | supermarket == "PINGODOCE"
replace big = 0 if big == .

save "${path_work}/auxiliar/tempfile.dta", replace

// Identify products with price below average (0) and above average (1).
keep if date >= td(01/01/2023) & date < td(01/03/2023)
egen average_price = mean(price), by(id_sm)
keep id_sm average_price
duplicates drop id_sm, force
save "${path_work}/auxiliar/average_price.dta", replace

use "${path_work}/auxiliar/tempfile.dta"

merge m:1 id_sm using "${path_work}/auxiliar/average_price.dta"
drop _merge
sum average_price if treatment == 1, d
return list
gen high_price = 1 if average_price >= `r(mean)' 
replace high_price = 0 if average_price < `r(mean)'

erase "${path_work}/auxiliar/tempfile.dta"

*********************************************************

// (2) Flag products that were not certain to be treated (c.f. Portal das Finanças and Excel VAT_Doubts)

// Generate a new variable 'vat_doubt' and initialize it to missing values.
gen vat_doubt = .

// Loop through each product in the 'doubt' global list.
foreach expr in nabo couve carapau sangue meloa melancia sumo nectar vitelo vitela vitelao novilha novilho boi leitao bacalhau barrigas linguas cabecas caras abas galinha carcaca carcaça espetada espetadas hamburguer tostas panado panada nugget requeijao {
    // Set 'vat_doubt' to 1 for rows where the product name contains the current product in the loop.
    replace vat_doubt = 1 if strpos(name, "`expr'") > 0
}

// Loop through specific products to be flagged for further investigation.
foreach expr in "massa instantanea" "tomate seco" "tomate desidratado" "arroz vaporizado" "batata doce" "leite infantil" "leite crianca" "batata congelada" "cebola chalota" "ervilha torta" "manteiga alho" "manteiga ervas" "manteiga amendoim" "conserva atum tomate" "conserva atum escabeche" "conserva atum tomate" "conserva atum algarvia" "conserva atum picante" "conserva atum caldeirada" "conserva peixe" "mini tomate cherry" "arroz basmati" "arroz jasmim" "rolo de carne" {
    // Set 'vat_doubt' to 1 for rows where the product name contains the current product in the loop.
    replace vat_doubt = 1 if strpos(name, "`expr'") > 0
}

*********************************************************

// (3) Correct measurement errors.

// Change treatment == 1 as these are measurement errors: products classified as control
// in a COICOP 5 that has only treated products.
replace treatment = 1 if ecoicop5 == 1111 | ecoicop5 == 1113 | ecoicop5 == 1121 | ///
			 ecoicop5 == 1122 | ecoicop5 == 1135 | ecoicop5 == 1141 | ///
			 ecoicop5 == 1142 | ecoicop5 == 1144 | ecoicop5 == 1145 | ///
			 ecoicop5 == 1147 | ecoicop5 == 1151 | ecoicop5 == 1153 | ///
			 ecoicop5 == 1154
		
replace treatment = 0 if ecoicop5 == 01112 | ecoicop5 == 01115 | ecoicop5 == 01117 | ///
			 ecoicop5 == 01123 | ecoicop5 == 01125 | ecoicop5 == 01126 | ///
			 ecoicop5 == 01127 | ecoicop5 == 01133 | ecoicop5 == 01134 | ///
			 ecoicop5 == 01143 | ecoicop5 == 01152 | ecoicop5 == 01155 | ///
			 ecoicop5 == 01162 | ecoicop5 == 01163 | ecoicop5 == 01164 | ///
			 ecoicop5 == 01175 | ecoicop5 == 01118 | ecoicop5 == 01182 | ///
			 ecoicop5 == 01184 | ecoicop5 == 01185 | ecoicop5 == 01186 | ///
			 ecoicop5 == 01191 | ecoicop5 == 01192 | ecoicop5 == 01193 | ///
			 ecoicop5 == 01194 | ecoicop5 == 01195 | ecoicop5 == 01211 | ///
			 ecoicop5 == 01212 | ecoicop5 == 01213 | ecoicop5 == 01221 | ///
			 ecoicop5 == 01222 | ecoicop5 == 01223 | ecoicop5 == 02111 | ///
			 ecoicop5 == 02112 | ecoicop5 == 02121 | ecoicop5 == 02122 | ///
			 ecoicop5 == 02123 | ecoicop5 == 02124 | ecoicop5 == 02131 | ///
			 ecoicop5 == 02132 | ecoicop5 == 02133 | ecoicop5 == 02134 | ///
			 ecoicop5 == 01181 | ecoicop5 == 01183

*********************************************************

// (4) Add usable time variable to the dataset and set it as a panel.


// Merge datasets based on the 'date' variable using the "weeks.dta" file.
merge m:1 date using "${path_work}/source/weeks.dta"
drop _merge

gen year = year(date)

// Generate a new variable 'week_year' representing the combination of year and week.
gen week_year = yw(year(date), week_number)

// Drop the original 'week' variable.
drop week_number

// Format 'week_year' as a Stata weekly date.
format week_year %tw

// Create a new variable 'month_year' representing the combination of year and month.
gen month_year = ym(year(date), month(date))

// Format 'month_year' as a Stata monthly date.
format month_year %tm

// Create a new variable 'day' with the day of the observation.
generate day = day(date)

// (5) Add weights variable.

// Merge datasets based on 'ecoicop' and 'treatment'.
merge m:1 ecoicop5 treatment using "${path_work}/source/ecoicop_categories.dta"
bys ecoicop5 treatment date: gen frequency = _N
gen weight = weight_2023 / frequency
replace weight = 0 if _merge == 1
drop _merge

// Set the data as a panel with daily observations.
xtset id_sm date, daily

erase "${path_work}/auxiliar/average_price.dta"

*********************************************************

// Exclude observations before January 1, 2023.
keep if date >= td(01/01/2023)

// Exclude treated products that raised doubts.
drop if vat_doubt == 1

// Exclude treated products with a decrease of 23%.
drop if (treatment == 1 & ecoicop5 == 01154)

// Exclude outlier observations.
drop if supermarket == "PINGODOCE" & (day == 1 | day == 2 | day == 31 | day == 30 | day == 28 | day == 29 | day == 27)

* Exclude all drinks.
drop if ecoicop5 == 01211 | ecoicop5 == 01212 | ecoicop5 == 01213 | ///
	ecoicop5 == 01221 | ecoicop5 == 01222 | ecoicop5 == 01223 | ///
	ecoicop5 == 02111 | ecoicop5 == 02112 | ecoicop5 == 02121 | ///
	ecoicop5 == 02122 | ecoicop5 == 02123 | ecoicop5 == 02124 | ///
	ecoicop5 == 02131 | ecoicop5 == 02132 | ecoicop5 == 02133 | ///
	ecoicop5 == 02134
	
// Save the updated dataset.
save "${path_work}/auxiliar/imputation_2.dta", replace

*********************************************************


// Load the cleaned dataset.
use "${path_work}/auxiliar/combined_dataset.dta", clear

keep if date >= td(01/12/2022)

unique id_sm

// Set the panel structure.
xtset id_sm date, daily

// Create a new variable 'total' containing the count of daily observations for each 'id_sm'.
bys id_sm: gen total = _N

// Summarize the 'total' variable to find the maximum count of daily observations.
sum total

// Keep only the observations where 'total' is equal to the maximum count (r(max)).
keep if total > 450 & total < 456

// Fill in missing values using time-series fill.
tsfill

// Save the relevant variables.
keep id_sm id ean13 date name brand supermarket price treatment ecoicop

// Loop through the variables and replace missing values
foreach v of varlist id ean13 date name brand supermarket price treatment ecoicop {
			
	// Replace missing values with the previous value if conditions are met.
		
	by id_sm (date): replace `v' = `v'[_n-1] if missing(`v') & _n > 1 & !missing(`v'[_n-1]) & _n < _N
}

// Set the panel structure.
xtset id_sm date, daily

// Create a new variable 'total' containing the count of daily observations for each 'id_sm'.
bys id_sm: gen total = _N

// Summarize the 'total' variable to find the maximum count of daily observations.
sum total

// Keep only the observations where 'total' is equal to the maximum count (r(max)).
keep if total == r(max)

*********************************************************

// (1) Construction of relevant variables.

// Compute a price index with base in the day before policy announcement I.
local   base_date 23/03/2023
egen    base_price_23032023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_23032023 = (price / base_price_23032023) * 100

// Compute a price index with base in the day before policy announcement II.
local   base_date 26/03/2023
egen    base_price_26032023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_26032023 = (price / base_price_26032023) * 100

// Compute a price index with base in the day before policy implementation.
local   base_date 17/04/2023
egen    base_price_17042023 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_17042023 = (price / base_price_17042023) * 100

// Compute a price index with base in the day before the end of the policy.
local   base_date 04/01/2024
egen    base_price_04012024 = total(price / (date == td(`base_date'))), by(id_sm)
gen     index_04012024 = (price / base_price_04012024) * 100

drop base_price_23032023 base_price_26032023 base_price_17042023 base_price_04012024  

// Rename variable ecoicop and identify the ECOICOP 4-digit classification.
rename ecoicop ecoicop5

gen ecoicop4 = .
replace ecoicop4 = 0111 if ecoicop5 == 01111 | ecoicop5 == 01113 | ecoicop5 == 01116
replace ecoicop4 = 0112 if ecoicop5 == 01121 | ecoicop5 == 01122 | ecoicop5 == 01124
replace ecoicop4 = 0113 if ecoicop5 == 01131 | ecoicop5 == 01132 
replace ecoicop4 = 0114 if ecoicop5 == 01141 | ecoicop5 == 01142 | ecoicop5 == 01144 | ///
			   ecoicop5 == 01145 | ecoicop5 == 01146 | ecoicop5 == 01147
replace ecoicop4 = 0115 if ecoicop5 == 01151 | ecoicop5 == 01153
replace ecoicop4 = 0116 if ecoicop5 == 01161
replace ecoicop4 = 0117 if ecoicop5 == 01171 | ecoicop5 == 01172 | ecoicop5 == 01174

// Identify products imported (0) and produced in Portugal (1).
gen domestic     = 1 if real(substr(string(ean13, "%14.0f"), 1, 3)) == 560
replace domestic = 0 if (real(substr(string(ean13, "%14.0f"), 1, 3)) != 560 & ean13 != .)

drop ean13

// Identify products from trademark-labels (0) and white-labels (1).
gen white_label = 0
foreach expr in "continente" "auchan" "pingo" "aldi" {
	replace white_label = 1 if strpos(brand, "`expr'") > 0
	}

drop brand

// Identify products from small supermarkets (0) and big supermarkets (1).
gen big = 1 if supermarket == "CONTINENTE" | supermarket == "PINGODOCE"
replace big = 0 if big == .

save "${path_work}/auxiliar/tempfile.dta", replace

// Identify products with price below average (0) and above average (1).
keep if date >= td(01/01/2023) & date < td(01/03/2023)
egen average_price = mean(price), by(id_sm)
keep id_sm average_price
duplicates drop id_sm, force
save "${path_work}/auxiliar/average_price.dta", replace

use "${path_work}/auxiliar/tempfile.dta"

merge m:1 id_sm using "${path_work}/auxiliar/average_price.dta"
drop _merge
sum average_price if treatment == 1, d
return list
gen high_price = 1 if average_price >= `r(mean)' 
replace high_price = 0 if average_price < `r(mean)'

erase "${path_work}/auxiliar/tempfile.dta"

*********************************************************

// (2) Flag products that were not certain to be treated (c.f. Portal das Finanças and Excel VAT_Doubts)

// Generate a new variable 'vat_doubt' and initialize it to missing values.
gen vat_doubt = .

// Loop through each product in the 'doubt' global list.
foreach expr in nabo couve carapau sangue meloa melancia sumo nectar vitelo vitela vitelao novilha novilho boi leitao bacalhau barrigas linguas cabecas caras abas galinha carcaca carcaça espetada espetadas hamburguer tostas panado panada nugget requeijao {
    // Set 'vat_doubt' to 1 for rows where the product name contains the current product in the loop.
    replace vat_doubt = 1 if strpos(name, "`expr'") > 0
}

// Loop through specific products to be flagged for further investigation.
foreach expr in "massa instantanea" "tomate seco" "tomate desidratado" "arroz vaporizado" "batata doce" "leite infantil" "leite crianca" "batata congelada" "cebola chalota" "ervilha torta" "manteiga alho" "manteiga ervas" "manteiga amendoim" "conserva atum tomate" "conserva atum escabeche" "conserva atum tomate" "conserva atum algarvia" "conserva atum picante" "conserva atum caldeirada" "conserva peixe" "mini tomate cherry" "arroz basmati" "arroz jasmim" "rolo de carne" {
    // Set 'vat_doubt' to 1 for rows where the product name contains the current product in the loop.
    replace vat_doubt = 1 if strpos(name, "`expr'") > 0
}

*********************************************************

// (3) Correct measurement errors.

// Change treatment == 1 as these are measurement errors: products classified as control
// in a COICOP 5 that has only treated products.
replace treatment = 1 if ecoicop5 == 1111 | ecoicop5 == 1113 | ecoicop5 == 1121 | ///
			 ecoicop5 == 1122 | ecoicop5 == 1135 | ecoicop5 == 1141 | ///
			 ecoicop5 == 1142 | ecoicop5 == 1144 | ecoicop5 == 1145 | ///
			 ecoicop5 == 1147 | ecoicop5 == 1151 | ecoicop5 == 1153 | ///
			 ecoicop5 == 1154
		
replace treatment = 0 if ecoicop5 == 01112 | ecoicop5 == 01115 | ecoicop5 == 01117 | ///
			 ecoicop5 == 01123 | ecoicop5 == 01125 | ecoicop5 == 01126 | ///
			 ecoicop5 == 01127 | ecoicop5 == 01133 | ecoicop5 == 01134 | ///
			 ecoicop5 == 01143 | ecoicop5 == 01152 | ecoicop5 == 01155 | ///
			 ecoicop5 == 01162 | ecoicop5 == 01163 | ecoicop5 == 01164 | ///
			 ecoicop5 == 01175 | ecoicop5 == 01118 | ecoicop5 == 01182 | ///
			 ecoicop5 == 01184 | ecoicop5 == 01185 | ecoicop5 == 01186 | ///
			 ecoicop5 == 01191 | ecoicop5 == 01192 | ecoicop5 == 01193 | ///
			 ecoicop5 == 01194 | ecoicop5 == 01195 | ecoicop5 == 01211 | ///
			 ecoicop5 == 01212 | ecoicop5 == 01213 | ecoicop5 == 01221 | ///
			 ecoicop5 == 01222 | ecoicop5 == 01223 | ecoicop5 == 02111 | ///
			 ecoicop5 == 02112 | ecoicop5 == 02121 | ecoicop5 == 02122 | ///
			 ecoicop5 == 02123 | ecoicop5 == 02124 | ecoicop5 == 02131 | ///
			 ecoicop5 == 02132 | ecoicop5 == 02133 | ecoicop5 == 02134 | ///
			 ecoicop5 == 01181 | ecoicop5 == 01183

*********************************************************

// (4) Add usable time variable to the dataset and set it as a panel.


// Merge datasets based on the 'date' variable using the "weeks.dta" file.
merge m:1 date using "${path_work}/source/weeks.dta"
drop _merge

gen year = year(date)

// Generate a new variable 'week_year' representing the combination of year and week.
gen week_year = yw(year(date), week_number)

// Drop the original 'week' variable.
drop week_number

// Format 'week_year' as a Stata weekly date.
format week_year %tw

// Create a new variable 'month_year' representing the combination of year and month.
gen month_year = ym(year(date), month(date))

// Format 'month_year' as a Stata monthly date.
format month_year %tm

// Create a new variable 'day' with the day of the observation.
generate day = day(date)

// (5) Add weights variable.

// Merge datasets based on 'ecoicop' and 'treatment'.
merge m:1 ecoicop5 treatment using "${path_work}/source/ecoicop_categories.dta"
bys ecoicop5 treatment date: gen frequency = _N
gen weight = weight_2023 / frequency
replace weight = 0 if _merge == 1
drop _merge

// Set the data as a panel with daily observations.
xtset id_sm date, daily

erase "${path_work}/auxiliar/average_price.dta"

*********************************************************

// Exclude observations before January 1, 2023.
keep if date >= td(01/01/2023)

// Exclude treated products that raised doubts.
drop if vat_doubt == 1

// Exclude treated products with a decrease of 23%.
drop if (treatment == 1 & ecoicop5 == 01154)

// Exclude outlier observations.
drop if supermarket == "PINGODOCE" & (day == 1 | day == 2 | day == 31 | day == 30 | day == 28 | day == 29 | day == 27)

* Exclude all drinks.
drop if ecoicop5 == 01211 | ecoicop5 == 01212 | ecoicop5 == 01213 | ///
	ecoicop5 == 01221 | ecoicop5 == 01222 | ecoicop5 == 01223 | ///
	ecoicop5 == 02111 | ecoicop5 == 02112 | ecoicop5 == 02121 | ///
	ecoicop5 == 02122 | ecoicop5 == 02123 | ecoicop5 == 02124 | ///
	ecoicop5 == 02131 | ecoicop5 == 02132 | ecoicop5 == 02133 | ///
	ecoicop5 == 02134

// Save the updated dataset.
save "${path_work}/auxiliar/constant_basket.dta", replace

*********************************************************

*********************************************************
*                Event-study Regression                 *
*********************************************************

local vce_cluster ${vce_cluster}
local weights ${weights}
local variable ${variable}
local aggregation ${aggregation}

file open  myfile using "${results_tables}/table_c3.txt", write text replace
file write myfile "Robustness, Description, Implementation, Reversal" _n _n
file close myfile

*********************************************************


* Data Imputation (1)
*********************************************************
	
// Load the cleaned dataset.
use "${path_work}/auxiliar/imputation_1.dta", clear

collapse (${aggregation}) ${variable} (first) ecoicop5 treatment weight, by (id_sm five_day_number)

// Compute a price index with base in the day before policy announcement I.
local   base_date 22
egen    base_price_23032023 = total(${variable}/ (five_day_number == `base_date')), by(id_sm)
gen     index_23032023 = (${variable}/ base_price_23032023) * 100

xtset id_sm five_day_number

preserve

// Keep the data around the event.
keep if five_day_number >= 17
keep if five_day_number <= 32

// Policy variable: 1 after the reference date.
replace treatment = 0 if five_day_number < 23

if ${weights} == 0 {

// Run the event-study regression.
xtevent index_23032023, policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} ${window_high}) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
	
}

if ${weights} == 1 {

// Run the event-study regression.
xtevent index_23032023 [pweight = weight], policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} ${window_high}) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
		
}

matrix results = r(delta)
matrix vcv = r(Vdelta)

scalar coef_implementation = results[1,10]
scalar se_implementation   = sqrt(vcv[10,10])

restore

preserve

// Keep the data around the event.
keep if five_day_number >= 60
keep if five_day_number <= 83

// Policy variable: 1 after the reference date.
replace treatment = 0 if five_day_number < 66

if ${weights} == 0 {

// Run the event-study regression.
xtevent index_23032023, policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} 16) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
	
}

if ${weights} == 1 {

// Run the event-study regression.
xtevent index_23032023 [pweight = weight], policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} 16) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
		
}

matrix results = r(delta)
matrix vcv = r(Vdelta)

scalar coef_reversal       = results[1,19]
scalar se_reversal         = sqrt(vcv[19,19])

file open  myfile using "${results_tables}/table_c3.txt", write text append

generate coeff_implementation = coef_implementation
generate see_implementation = se_implementation
generate coeff_reversal = coef_reversal
generate see_reversal = se_reversal

format %12.0g coeff_implementation see_implementation coeff_reversal see_reversal
file write myfile "1, Data Imputation (1), " (coeff_implementation) ", " (coeff_reversal) _n ", , (" (see_implementation) "), (" (see_reversal) ")" _n _n 
drop coeff_implementation see_implementation coeff_reversal see_reversal

file close myfile

restore

erase "${path_work}/auxiliar/imputation_1.dta"

*********************************************************


* Data Imputation (2)
*********************************************************
	
// Load the cleaned dataset.
use "${path_work}/auxiliar/imputation_2.dta", clear

collapse (${aggregation}) ${variable} (first) ecoicop5 treatment weight, by (id_sm five_day_number)

// Compute a price index with base in the day before policy announcement I.
local   base_date 22
egen    base_price_23032023 = total(${variable}/ (five_day_number == `base_date')), by(id_sm)
gen     index_23032023 = (${variable}/ base_price_23032023) * 100

xtset id_sm five_day_number

preserve

// Keep the data around the event.
keep if five_day_number >= 17
keep if five_day_number <= 32

// Policy variable: 1 after the reference date.
replace treatment = 0 if five_day_number < 23

if ${weights} == 0 {

// Run the event-study regression.
xtevent index_23032023, policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} ${window_high}) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
	
}

if ${weights} == 1 {

// Run the event-study regression.
xtevent index_23032023 [pweight = weight], policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} ${window_high}) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
		
}

matrix results = r(delta)
matrix vcv = r(Vdelta)

scalar coef_implementation = results[1,10]
scalar se_implementation   = sqrt(vcv[10,10])

restore

preserve

// Keep the data around the event.
keep if five_day_number >= 60
keep if five_day_number <= 83

// Policy variable: 1 after the reference date.
replace treatment = 0 if five_day_number < 66

if ${weights} == 0 {

// Run the event-study regression.
xtevent index_23032023, policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} 16) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
	
}

if ${weights} == 1 {

// Run the event-study regression.
xtevent index_23032023 [pweight = weight], policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} 16) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
		
}

matrix results = r(delta)
matrix vcv = r(Vdelta)

scalar coef_reversal       = results[1,19]
scalar se_reversal         = sqrt(vcv[19,19])

file open  myfile using "${results_tables}/table_c3.txt", write text append

generate coeff_implementation = coef_implementation
generate see_implementation = se_implementation
generate coeff_reversal = coef_reversal
generate see_reversal = se_reversal

format %12.0g coeff_implementation see_implementation coeff_reversal see_reversal
file write myfile "2, Data Imputation (2), " (coeff_implementation) ", " (coeff_reversal) _n ", , (" (see_implementation) "), (" (see_reversal) ")" _n _n 
drop coeff_implementation see_implementation coeff_reversal see_reversal

file close myfile

restore

erase "${path_work}/auxiliar/imputation_2.dta"

*********************************************************


* Balanced Panel
*********************************************************
	
// Load the cleaned dataset.
use "${path_work}/auxiliar/constant_basket.dta", clear

collapse (${aggregation}) ${variable} (first) ecoicop5 treatment weight, by (id_sm five_day_number)

// Compute a price index with base in the day before policy announcement I.
local   base_date 22
egen    base_price_23032023 = total(${variable}/ (five_day_number == `base_date')), by(id_sm)
gen     index_23032023 = (${variable}/ base_price_23032023) * 100

xtset id_sm five_day_number

preserve

// Keep the data around the event.
keep if five_day_number >= 17
keep if five_day_number <= 32

// Policy variable: 1 after the reference date.
replace treatment = 0 if five_day_number < 23

if ${weights} == 0 {

// Run the event-study regression.
xtevent index_23032023, policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} ${window_high}) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
	
}

if ${weights} == 1 {

// Run the event-study regression.
xtevent index_23032023 [pweight = weight], policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} ${window_high}) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
		
}

matrix results = r(delta)
matrix vcv = r(Vdelta)

scalar coef_implementation = results[1,10]
scalar se_implementation   = sqrt(vcv[10,10])

restore

preserve

// Keep the data around the event.
keep if five_day_number >= 60
keep if five_day_number <= 83

// Policy variable: 1 after the reference date.
replace treatment = 0 if five_day_number < 66

if ${weights} == 0 {

// Run the event-study regression.
xtevent index_23032023, policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} 16) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
	
}

if ${weights} == 1 {

// Run the event-study regression.
xtevent index_23032023 [pweight = weight], policyvar(treatment) panelvar(id_sm) timevar(five_day_number) window(${window_low} 16) vce(cluster ${vce_cluster}) impute(nuchange) reghdfe trend(${trend}, method(gmm) saveoverlay)
		
}

matrix results = r(delta)
matrix vcv = r(Vdelta)

scalar coef_reversal       = results[1,19]
scalar se_reversal         = sqrt(vcv[19,19])

file open  myfile using "${results_tables}/table_c3.txt", write text append

generate coeff_implementation = coef_implementation
generate see_implementation = se_implementation
generate coeff_reversal = coef_reversal
generate see_reversal = se_reversal

format %12.0g coeff_implementation see_implementation coeff_reversal see_reversal
file write myfile "3, Balanced Panel, " (coeff_implementation) ", " (coeff_reversal) _n ", , (" (see_implementation) "), (" (see_reversal) ")" _n _n 
drop coeff_implementation see_implementation coeff_reversal see_reversal

file close myfile

restore

erase "${path_work}/auxiliar/constant_basket.dta"

*********************************************************
